% use `texdoc biblatex` to get help
@inproceedings{dauphin2014identifying,
  author    = {Dauphin, Yann N and Pascanu, Razvan and Gulcehre, Caglar and Cho, Kyunghyun and Ganguli, Surya and Bengio, Yoshua},
  title     = {Identifying and attacking the saddle point problem in high-dimensional non-convex optimization},
  booktitle = {Advances in Neural Information Processing Systems},
  pages     = {2933--2941},
  year      = {2014},
}

% NOTE(review): the journal/volume/page data below looks like a one-page book
% review of the textbook rather than the textbook itself -- confirm which of
% the two is meant to be cited.
@article{bertsekas1997nonlinear,
  author    = {Bertsekas, Dimitri P},
  title     = {Nonlinear programming},
  journal   = {Journal of the Operational Research Society},
  volume    = {48},
  number    = {3},
  pages     = {334},
  year      = {1997},
  publisher = {Taylor \& Francis},
}

@article{sun2019optimization,
  author  = {Sun, Ruoyu},
  title   = {Optimization for deep learning: theory and algorithms},
  journal = {arXiv preprint arXiv:1912.08957},
  year    = {2019},
}

@article{shamir2018exponential,
  author  = {Shamir, Ohad},
  title   = {Exponential convergence time of gradient descent for one-dimensional deep linear neural networks},
  journal = {arXiv preprint arXiv:1809.08587},
  year    = {2018},
}

@article{bray2007statistics,
  author    = {Bray, Alan J and Dean, David S},
  title     = {Statistics of critical points of {Gaussian} fields on large-dimensional spaces},
  journal   = {Physical Review Letters},
  volume    = {98},
  number    = {15},
  pages     = {150201},
  year      = {2007},
  publisher = {APS},
}

@article{mishkin2015all,
  author  = {Mishkin, Dmytro and Matas, Jiri},
  title   = {All you need is a good init},
  journal = {arXiv preprint arXiv:1511.06422},
  year    = {2015},
}

@article{romero2014fitnets,
  author  = {Romero, Adriana and Ballas, Nicolas and Kahou, Samira Ebrahimi and Chassang, Antoine and Gatta, Carlo and Bengio, Yoshua},
  title   = {{FitNets}: Hints for thin deep nets},
  journal = {arXiv preprint arXiv:1412.6550},
  year    = {2014},
}

@inproceedings{glorot2010understanding,
  author    = {Glorot, Xavier and Bengio, Yoshua},
  title     = {Understanding the difficulty of training deep feedforward neural networks},
  booktitle = {Proceedings of the Thirteenth International Conference on Artificial Intelligence and Statistics},
  pages     = {249--256},
  year      = {2010},
}

@inproceedings{jia2014caffe,
  author    = {Jia, Yangqing and Shelhamer, Evan and Donahue, Jeff and Karayev, Sergey and Long, Jonathan and Girshick, Ross and Guadarrama, Sergio and Darrell, Trevor},
  title     = {Caffe: Convolutional architecture for fast feature embedding},
  booktitle = {Proceedings of the 22nd ACM international conference on Multimedia},
  pages     = {675--678},
  year      = {2014},
}

@inproceedings{he2015delving,
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title     = {Delving deep into rectifiers: Surpassing human-level performance on {ImageNet} classification},
  booktitle = {Proceedings of the {IEEE} International Conference on Computer Vision},
  pages     = {1026--1034},
  year      = {2015},
}

@article{saxe2013exact,
  author  = {Saxe, Andrew M and McClelland, James L and Ganguli, Surya},
  title   = {Exact solutions to the nonlinear dynamics of learning in deep linear neural networks},
  journal = {arXiv preprint arXiv:1312.6120},
  year    = {2013},
}

@article{qian1999momentum,
  author    = {Qian, Ning},
  title     = {On the momentum term in gradient descent learning algorithms},
  journal   = {Neural Networks},
  volume    = {12},
  number    = {1},
  pages     = {145--151},
  year      = {1999},
  publisher = {Elsevier},
}

% NOTE(review): entry type switched from article to book (no journal was given);
% the publisher and the second author's name were corrected from the exported
% metadata -- confirm both against the edition actually used.
@book{burden2010numerical,
  author    = {Burden, Richard L and Faires, J Douglas},
  title     = {Numerical analysis},
  publisher = {Brooks/Cole, Cengage Learning},
  year      = {2010},
}

% NOTE(review): the source is a periodical with a volume number, so the entry
% is typed as a journal article; the issue number is not recorded here.
@article{nesterov1983method,
  author  = {Nesterov, Yurii},
  title   = {A method for unconstrained convex minimization problem with the rate of convergence {$O(1/k^2)$}},
  journal = {Doklady AN USSR},
  volume  = {269},
  pages   = {543--547},
  year    = {1983},
}

@phdthesis{sutskever2013training,
  author  = {Sutskever, Ilya},
  title   = {Training recurrent neural networks},
  school  = {University of Toronto},
  address = {Toronto, Ontario, Canada},
  year    = {2013},
}

@misc{momentum,
  author       = {Orr, Genevieve B.},
  title        = {Momentum and Learning Rate Adaptation},
  howpublished = {\url{https://www.willamette.edu/~gorr/classes/cs449/momrate.html}},
  note         = {Accessed February 20, 2020},
}

@misc{rmsprop,
  author       = {Hinton, Geoff},
  title        = {Overview of Mini-batch Gradient Descent},
  howpublished = {\url{http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf}},
  note         = {Accessed February 21, 2020},
}

@article{duchi2011adaptive,
  author  = {Duchi, John and Hazan, Elad and Singer, Yoram},
  title   = {Adaptive subgradient methods for online learning and stochastic optimization},
  journal = {Journal of Machine Learning Research},
  volume  = {12},
  month   = jul,
  pages   = {2121--2159},
  year    = {2011},
}

@article{zeiler2012adadelta,
  author  = {Zeiler, Matthew D},
  title   = {Adadelta: an adaptive learning rate method},
  journal = {arXiv preprint arXiv:1212.5701},
  year    = {2012},
}

@article{kingma2014adam,
  author  = {Kingma, Diederik P and Ba, Jimmy},
  title   = {Adam: A method for stochastic optimization},
  journal = {arXiv preprint arXiv:1412.6980},
  year    = {2014},
}

% NOTE(review): the original entry had no venue at all; the workshop venue
% below was supplied by the reviewer -- confirm before publication.
@inproceedings{dozat2016incorporating,
  author    = {Dozat, Timothy},
  title     = {Incorporating {Nesterov} momentum into {Adam}},
  booktitle = {International Conference on Learning Representations ({ICLR}), Workshop Track},
  year      = {2016},
}

@techreport{poggio2016theory,
  author      = {Poggio, Tomaso and Mhaskar, Hrushikesh and Rosasco, Lorenzo and Miranda, Brando and Liao, Qianli},
  title       = {Theory {I}: Why and when can deep networks avoid the curse of dimensionality?},
  institution = {Center for Brains, Minds and Machines (CBMM), arXiv},
  year        = {2016},
}

@techreport{poggio2017theory,
  author      = {Poggio, Tomaso and Liao, Qianli},
  title       = {Theory {II}: Landscape of the empirical risk in deep learning},
  institution = {Center for Brains, Minds and Machines (CBMM), arXiv},
  year        = {2017},
}

@article{banburski2019theory,
  author  = {Banburski, Andrzej and Liao, Qianli and Miranda, Brando and Rosasco, Lorenzo and Liang, Bob and Hidary, Jack and Poggio, Tomaso},
  title   = {Theory {III}: Dynamics and generalization in deep networks},
  journal = {arXiv preprint arXiv:1903.04991},
  year    = {2019},
}

@article{mhaskar1996neural,
  author    = {Mhaskar, Hrushikesh N},
  title     = {Neural networks for optimal approximation of smooth and analytic functions},
  journal   = {Neural Computation},
  volume    = {8},
  number    = {1},
  pages     = {164--177},
  year      = {1996},
  publisher = {MIT Press},
}

@misc{bezout,
  author       = {{MIT}},
  title        = {Bezout Theorem},
  howpublished = {\url{http://math.mit.edu/~lguth/PolyMethod/lect13.pdf}},
  note         = {Accessed February 22, 2020},
}

@misc{hyperbolic,
  author       = {{Wikipedia}},
  title        = {Hyperbolic Equilibrium Point},
  howpublished = {\url{https://wikivisually.com/wiki/Hyperbolic_equilibrium_point}},
  note         = {Accessed February 23, 2020},
}

@misc{mnist,
  author       = {LeCun, Yann},
  title        = {THE {MNIST} DATABASE of handwritten digits},
  howpublished = {\url{http://yann.lecun.com/exdb/mnist/}},
  note         = {Accessed February 28, 2020},
}
